
class String

  # A fuzzy matching mechanism. Returns a score from 0-1,
  # based on the number of shared edges.
  # To be effective, the strings must be of length 2 or greater.
  #
  #   "Alexsander".fuzzy_match( "Aleksander" )  #=> 0.9
  #
  # The way it works:
  #
  # * Converts each string into a "graph like" object, with edges
  #     "alexsander" -> [ alexsander, alexsand, alexsan ... lexsand ... san ... an, etc ]
  #     "aleksander" -> [ aleksander, aleksand ... etc. ]
  # * Perform match, then remove any subsets from this matched set (i.e. a hit
  #   on "san" is a subset of a hit on "sander")
  #     Above example, once reduced -> [ ale, sander ]
  # * See's how many of the matches remain, and calculates a score based
  #   on how many matches, their length, and compare to the length of the
  #   larger of the two words.
  #--
  # Credit goes to Derek Lewis. Thanks Derek!
  # Still a bit rough. Any suggestions for improvement are welcome.
  #++
  def similarity( str_in )
    return 0 if str_in == nil
    return 1 if self == str_in

    # Make a graph of each word (okay, so its not a true graph, but is similar)
    graph_A = Array.new
    graph_B = Array.new

    # "graph" self
    last = self.length
    (0..last).each do |ff|
      loc  = self.length
      break if ff == last - 1
      wordB = (1..(last-1)).to_a.reverse!
      if (wordB != nil)
        wordB.each do |ss|
          break if ss == ff
          graph_A.push( "#{self[ff..ss]}" )
        end
      end
    end

    # "graph" input string
    last = str_in.length
    (0..last).each{ |ff|
      loc  = str_in.length
      break if ff == last - 1
      wordB = (1..(last-1)).to_a.reverse!
      wordB.each do |ss|
        break if ss == ff
        graph_B.push( "#{str_in[ff..ss]}" )
      end
    }


    # count how many of these "graph edges" we have that are the same
    matches = graph_A & graph_B
    #matches = Array.new
    #graph_A.each do |aa|
    #  matches.push( aa ) if( graph_B.include?( aa ) )
    #end

    # For eliminating subsets, we want to start with the smallest hits.
    matches.sort!{|x,y| x.length <=> y.length}

    # eliminate any subsets
    mclone = matches.dup
    mclone.each_index do |ii|
      reg = Regexp.compile( mclone[ii] )
      count = 0.0
      matches.each{|xx| count += 1 if xx =~ reg}
      matches.delete(mclone[ii]) if count > 1
    end

    score = 0.0
    matches.each{ |mm| score += mm.length }
    self.length > str_in.length ? largest = self.length : largest = str_in.length
    return score/largest
  end

  alias_method :fuzzy_match, :similarity

end


#  _____         _
# |_   _|__  ___| |_
#   | |/ _ \/ __| __|
#   | |  __/\__ \ |_
#   |_|\___||___/\__|
#
=begin test

  require 'test/unit'

  class TCString < Test::Unit::TestCase

    def test_similarity
      assert_nothing_raised { "Hello World!".similarity( "helo wrld" ) }
    end

  end

=end
